Police Brutatlity Data Exploration top of page

Contributor(s): Jessica Marx

Date: 12 June 2020

Code: GitHub

Data: Sourced from this Google Sheet and this census data from Kaggle

The following is not meant to be a source of truth in terms of data integrity, but rather to illustrate different ways of displaying, engineering, and/or presenting the data.

# load packages, install if needed
packages = c(
      "dplyr"
    , "ggplot2"
    , "formattable"
    , "plotly"
    , "RColorBrewer"
    , "scales"
    , "stringr"
    , "tidyr"
    , "ElmeR"
    , "RJDBC"
    , "kableExtra"
    , "wesanderson"
    , "reshape2"
    , "rtweet"
    , "tidytext"
    , "lubridate"
    , "wordcloud"
    , "ggpubr"
    , "ggthemes"
    , "knitrBootstrap"
    , "DT"
    , "MatchIt"
    , "beyonce"
    , "UpSetR"
    , "gganimate"
    , "wordcloud2"
    , "widyr"
    , "ggraph"
    , "igraph"
    , "aod"
    , "corrplot"
    , "ROCR"
    , "InformationValue"
    , "car"
    , "glmnet"
    , "caret"
    , "kernlab"
    , "pdp"
    , "rpart.plot"
    , "rpart"
    , "e1071"
    )
package.check <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x, dependencies = TRUE)
    library(x, character.only = TRUE)
  }
})
#import the data
library(readr)
us_county_stats <- read_csv("datasets_579969_1220276_us_county.csv")
brutality_cases <- read_csv("police_brutality.csv")
# glimpse the data
# us_county_stats %>% 
#   head() 
# 
# brutality_cases %>% 
#   head()

Columns and percent of rows with missing fields from Brutality dataset.

#missing values
colMeans(is.na(brutality_cases))
                                          Timestamp 
                                       1.0000000000 
                       Date of the Police Brutality 
                                       0.0000000000 
           Approximate Time of the Police Brutality 
                                       1.0000000000 
          State where the Police Brutality Occurred 
                                       0.0000000000 
           City where the Police Brutality Occurred 
                                       0.0007829832 
                  Video URL of the Police Brutality 
                                       0.4519117839 
                          Names of Victims Involved 
                                       0.0000000000 
                  Names of Police Officers Involved 
                                       1.0000000000 
          Badge Numbers of Police Officers involved 
                                       1.0000000000 
Enter any extra information you want to share here. 
                                       1.0000000000 
                                           Category 
                                       1.0000000000 
                                               test 
                                       1.0000000000 
                                             County 
                                       0.0000000000 
                                         Category_1 
                                       1.0000000000 

Summary of US census dataset.

summary(us_county_stats)
      fips          county             state          
 Min.   : 1001   Length:3220        Length:3220       
 1st Qu.:19032   Class :character   Class :character  
 Median :30024   Mode  :character   Mode  :character  
 Mean   :31394                                        
 3rd Qu.:46106                                        
 Max.   :72153                                        
  state_code             male             female       
 Length:3220        Min.   :     38   Min.   :     37  
 Class :character   1st Qu.:   5658   1st Qu.:   5573  
 Mode  :character   Median :  12916   Median :  12996  
                    Mean   :  49875   Mean   :  51457  
                    3rd Qu.:  33248   3rd Qu.:  33531  
                    Max.   :4976788   Max.   :5121264  
   median_age      population       female_percentage
 Min.   :21.70   Min.   :      75   Min.   :21.00    
 1st Qu.:38.10   1st Qu.:   11214   1st Qu.:49.43    
 Median :41.20   Median :   25950   Median :50.42    
 Mean   :41.28   Mean   :  101332   Mean   :49.96    
 3rd Qu.:44.30   3rd Qu.:   66552   3rd Qu.:51.15    
 Max.   :67.00   Max.   :10098052   Max.   :58.61    
      lat             long        
 Min.   :17.98   Min.   :-164.03  
 1st Qu.:34.35   1st Qu.: -98.09  
 Median :38.21   Median : -89.95  
 Mean   :37.97   Mean   : -91.64  
 3rd Qu.:41.69   3rd Qu.: -82.99  
 Max.   :69.31   Max.   : -65.29  
# Clean dataset (drop nulls)
brutality_clean = brutality_cases
# Remove columns with more than 50% NA
brutality_clean = brutality_clean[, which(colMeans(!is.na(brutality_clean)) > 0.5)]
# Change columns to snake case
snake_case <- function(x) {
  colnames(x) <- gsub(" ", "_", colnames(x));x
  colnames(x) <-tolower(colnames(x))
}
colnames(brutality_clean) = snake_case(brutality_clean)
brutality_clean = brutality_clean %>% 
  rename(
   "state" = state_where_the_police_brutality_occurred
   , "city" = city_where_the_police_brutality_occurred
   , "date" = date_of_the_police_brutality
  ) %>% 
  mutate(
    state_county = paste0(state, "-", county)
  )
brutality_clean$date = as.Date(brutality_clean$date, "%m/%d/%Y")
us_stats_clean = us_county_stats
us_stats_clean$county = str_replace(us_stats_clean$county, " County", "")
us_stats_clean = us_stats_clean %>% 
  group_by(state) %>% 
  mutate(state_pop = sum(population)) %>% 
  rename(
    "state_full" = state
    , "state" = state_code
    ) %>% 
  ungroup() 
# us_stats_clean = 
  
us_stats_state = us_stats_clean %>% 
  mutate(state_county = paste0(state, "-", county)) %>% 
  select(state_pop, state, state_full) %>% 
  unique()
df_merged = brutality_clean %>% 
  left_join(us_stats_state) %>% 
  mutate(state_pop = ifelse(state == "DC", 705749, state_pop)
  ) %>% 
  unique()
df_merged = df_merged %>% 
  group_by(
    state
  ) %>% 
  add_tally() %>% 
  ungroup() %>% 
  rename(
    "state_totals" = n
  ) %>% 
  mutate(
    state_pop_millions = state_pop/1000000
    , totals_per_million = state_totals/state_pop_millions
  ) 

Incidents by State

df_plot = df_merged %>% 
  select(
    state, state_pop, state_full, state_pop_millions, state_totals, totals_per_million
  ) %>% 
  unique() %>% 
  arrange(desc(totals_per_million)) %>% 
  mutate(
    state = reorder(as.factor(state), totals_per_million)
  ) 
a = df_plot %>% 
  plot_ly(
    y = ~state
    , x = ~totals_per_million
    , type = "bar"
    , marker = list(color = "#35A7FF")
    , orientation = "h"
    , width = 900
    , height = 1000
    , hoverinfo = "text"
    , text = ~paste(
      "State: ", state_full
      , "<br>State Population: ", comma(state_pop)
      , "<br>Reported Incidents: ", state_totals
      , "<br>Incidents per Million Residents: ", comma(totals_per_million)
    )
  ) %>% 
  layout(
    xaxis = list(title = "Reported Incidents per Million State Residents")
    , yaxis = list(title = "")
  )
b = df_plot %>% 
  plot_ly(
    y = ~state
    , x = ~state_totals
    , type = "bar"
    , marker = list(color = "#35A7FF")
    , orientation = "h"
    , width = 900
    , height = 1000
    , hoverinfo = "text"
    , showlegend = FALSE
    , text = ~paste(
      "State: ", state_full
      , "<br>State Population: ", comma(state_pop)
      , "<br>Reported Incidents: ", state_totals
      , "<br>Incidents per Million Residents: ", comma(totals_per_million)
    )
  ) %>% 
  layout(
    xaxis = list(title = "Total Reported Incidents")
    , yaxis = list(title = "")
  )
subplot(a, b, titleX = TRUE)

Excluding counties with null populations

There are some counties with null values when it comes to population. We can obviously find these via a better dataset, but for now I’m going to exclude them in order to provide examples with the data that we have. Obviously, this is not accurate reporting.

County Incidents per Capita

df_filtered = df_merged
df_filtered = df_filtered %>% 
  left_join(us_stats_clean) %>% 
  filter(!is.na(population)) 
df_filtered = df_filtered %>% 
  group_by(state, county) %>% 
  add_tally() %>% 
  rename("county_totals" = n) %>% 
  arrange(desc(county_totals)) %>% 
  mutate("incidents_per_capita" = county_totals/(population)) %>% 
  ungroup() 
df_county = df_filtered %>% 
  select(state, state_county, county, state_full, state_totals, median_age, population, county_totals, incidents_per_capita) %>% 
  unique() %>% 
  arrange(desc(incidents_per_capita))
df_county %>% 
  mutate(
    incidents_per_capita = comma(incidents_per_capita, .000001)
    , population = comma(population)
    ) %>% 
  datatable()
---
output:
  html_notebook:
    code_folding: hide
    toc: false
    toc_float: true
    toc_depth: 5
    number_sections: false
    
---
<style type="text/css">

body {
  font-size: 12pt;
  font-family: "Arial", sans-serif;
}

th {
    background-color: #35A7FF;
    color: black;
    font-size: 10pt;
    font-family: "Arial", sans-serif;
    text-align: left;
    <!-- margin-left: auto; -->
    <!-- margin-right: auto; -->
    <!-- padding-top: 25px; -->
  }

td {  /* Table  */
  font-size: 10pt;
  <!-- text-align: center; -->
  font-family: "Arial", sans-serif;
  <!-- padding-top: 25px; -->
}

h1 {
  font-size: 16pt;
  font-family: "Arial", sans-serif;
}
  
h2 {
  font-size: 12pt;
  font-family: "Arial", sans-serif;
  color: #1f78b4;
  font-family: "Arial", sans-serif;
}

h3 {
  font-size: 12pt;
  font-family: "Arial", sans-serif;
  }
  
h4 {
  font-size: 12pt;
  font-family: "Arial", sans-serif;
}
h5 {
  font-size: 12pt;
  font-family: "Arial", sans-serif;
}
a {
  color: #35A7FF;
  font-size: 12pt;
  font-family: "Arial", sans-serif;
}


.sidenav {
  height: 100%;
  width: 200px;
  position: fixed;
  z-index: 1;
  top: 0;
  left: 0;
  background-color: #38618C;
  overflow-x: hidden;
  padding-top: 20px
  padding-left: 20px;
}

.sidenav a {
  padding: 6px 8px 6px 16px;
  text-decoration: none;
  font-size: 16pt;
  font-weight: bolder;
  font-family: "Arial", sans-serif;
  color: #FFFFFF;
  display: block;
  text-align: center;
}

.center {
  display: block;
  margin-left: auto;
  margin-right: auto;
  width: 100%;
}

.sidenav a:hover {
  color: #f1f1f1;
}

.main {
  margin-left: 200px; /* Same as the width of the sidenav */

}
.main-container {
  max-width: 1400px;
  margin-left: auto;
  margin-right: auto;
  padding: 25px
}
  /*padding: 0px 5px; */
}

@media screen and (max-height: 450px) {
  .sidenav {padding-top: 15px;}
  .sidenav a {font-size: 18px;}  
}
</style>

<!-- TITLE INFO  -->

<div class="sidenav">
  <img src="grassroots_law.png" alt="" width=100px class="center"/>
  <a href="#about">Police Brutatlity Data Exploration</a>
  <a href="#top"> <font face="Arial" size="2" color= "#35A7FF"> top of page </font></a>
</div>

<!-- CONTENT STARTS HERE  -->

<div class="main">
<div class="body">

## __Contributor(s):__ Jessica Marx
## __Date:__ `r format(Sys.time(), "%d %B %Y")`
## __Code:__ <a href="https://github.com/mzagainova/pb-dashboard/tree/master/widgets" target="_blank">GitHub</a>
## <b>Data:</b> Sourced from this <a href="https://docs.google.com/spreadsheets/d/11wLWM957wkxRIyK7EFW-So9jqGhvQxcZ83ipm1kVEnQ/edit#gid=916411806" target="_blank">Google Sheet</a> and this census data from <a href="https://www.kaggle.com/headsortails/covid19-us-county-jhu-data-demographics?select=us_county.csv" target="_blank">Kaggle</a>

The following is not meant to be a source of truth in terms of data integrity, but rather to illustrate different ways of displaying, engineering, and/or presenting the data. 

```{r setup, include=FALSE}

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

```


```{r package, message = FALSE, warning = FALSE}
# load packages, install if needed
packages = c(
      "dplyr"
    , "ggplot2"
    , "formattable"
    , "plotly"
    , "RColorBrewer"
    , "scales"
    , "stringr"
    , "tidyr"
    , "ElmeR"
    , "RJDBC"
    , "kableExtra"
    , "wesanderson"
    , "reshape2"
    , "rtweet"
    , "tidytext"
    , "lubridate"
    , "wordcloud"
    , "ggpubr"
    , "ggthemes"
    , "knitrBootstrap"
    , "DT"
    , "MatchIt"
    , "beyonce"
    , "UpSetR"
    , "gganimate"
    , "wordcloud2"
    , "widyr"
    , "ggraph"
    , "igraph"
    , "aod"
    , "corrplot"
    , "ROCR"
    , "InformationValue"
    , "car"
    , "glmnet"
    , "caret"
    , "kernlab"
    , "pdp"
    , "rpart.plot"
    , "rpart"
    , "e1071"
    )

package.check <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x, dependencies = TRUE)
    library(x, character.only = TRUE)
  }
})
```


```{r functions, echo=FALSE}

#functions!

#round 
round.to <- function(x, b) {
  round(x/b)*b
}

#odds to probability
odds.to.prob <- function(odds) {
  odds/(1 + odds) 
}

#log odds to probability 
logit2prob <- function(logit){
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}

#convert to a range
range01 <- function(x){
  (x-min(x))/(max(x)-min(x))
}

#function to get vector of color values from RColorBrewer
get_hex_values <- function(pal) {
  brewer.pal(brewer.pal.info[pal, "maxcolors"], pal)
}
paired_cols <- get_hex_values(pal = "Paired")

#paired palette with brighter yellow (use this for divisional color mapping)
paired_better <- replace(paired_cols, paired_cols == "#FFFF99", "#fed976")

#round to nearest 5, 10, etc 
round_nearest = function(x, base) {
  base*round(x/base)
}

left = function(text, num_char) {
  substr(text, 1, num_char)
}

mid = function(text, start_num, num_char) {
  substr(text, start_num, start_num + num_char - 1)
}

right = function(text, num_char) {
  substr(text, nchar(text) - (num_char-1), nchar(text))
}

#turn values outside of IQR to NA
outlierreplacement <- function(dataframe){
  dataframe %>%          
           map_if(is.numeric, ~ replace(.x, .x %in% boxplot.stats(.x)$out, NA)) %>%
           dplyr::bind_cols() 
}

#evaluate model
eval_metrics = function(model, df, predictions, target){
  resids = df[,target] - predictions
  resids2 = resids**2
  N = length(predictions)
  r2 = as.character(round(summary(model)$r.squared, 2))
  adj_r2 = as.character(round(summary(model)$adj.r.squared, 2))
  print(adj_r2) #Adjusted R-squared
  print(as.character(round(sqrt(sum(resids2)/N), 2))) #RMSE
}

```

```{r}

#import the data
library(readr)
us_county_stats <- read_csv("datasets_579969_1220276_us_county.csv")
brutality_cases <- read_csv("police_brutality.csv")

```


```{r}

# glimpse the data
# us_county_stats %>% 
#   head() 
# 
# brutality_cases %>% 
#   head()

```

## Columns and percent of rows with missing fields from Brutality dataset. 
```{r}

#missing values
colMeans(is.na(brutality_cases))

```

## Summary of US census dataset. 

```{r}

summary(us_county_stats)

```

```{r}

# Clean dataset (drop nulls)
brutality_clean = brutality_cases

# Remove columns with more than 50% NA
brutality_clean = brutality_clean[, which(colMeans(!is.na(brutality_clean)) > 0.5)]

# Change columns to snake case

snake_case <- function(x) {
  colnames(x) <- gsub(" ", "_", colnames(x));x
  colnames(x) <-tolower(colnames(x))
}

colnames(brutality_clean) = snake_case(brutality_clean)

brutality_clean = brutality_clean %>% 
  rename(
   "state" = state_where_the_police_brutality_occurred
   , "city" = city_where_the_police_brutality_occurred
   , "date" = date_of_the_police_brutality
  ) %>% 
  mutate(
    state_county = paste0(state, "-", county)
  )

brutality_clean$date = as.Date(brutality_clean$date, "%m/%d/%Y")

```

```{r}

us_stats_clean = us_county_stats

us_stats_clean$county = str_replace(us_stats_clean$county, " County", "")

us_stats_clean = us_stats_clean %>% 
  group_by(state) %>% 
  mutate(state_pop = sum(population)) %>% 
  rename(
    "state_full" = state
    , "state" = state_code
    ) %>% 
  ungroup() 

# us_stats_clean = 
  
us_stats_state = us_stats_clean %>% 
  mutate(state_county = paste0(state, "-", county)) %>% 
  select(state_pop, state, state_full) %>% 
  unique()

```


```{r, message=FALSE}

df_merged = brutality_clean %>% 
  left_join(us_stats_state) %>% 
  mutate(state_pop = ifelse(state == "DC", 705749, state_pop)
  ) %>% 
  unique()

df_merged = df_merged %>% 
  group_by(
    state
  ) %>% 
  add_tally() %>% 
  ungroup() %>% 
  rename(
    "state_totals" = n
  ) %>% 
  mutate(
    state_pop_millions = state_pop/1000000
    , totals_per_million = state_totals/state_pop_millions
  ) 

```

## Incidents by State

```{r}

df_plot = df_merged %>% 
  select(
    state, state_pop, state_full, state_pop_millions, state_totals, totals_per_million
  ) %>% 
  unique() %>% 
  arrange(desc(totals_per_million)) %>% 
  mutate(
    state = reorder(as.factor(state), totals_per_million)
  ) 

a = df_plot %>% 
  plot_ly(
    y = ~state
    , x = ~totals_per_million
    , type = "bar"
    , marker = list(color = "#35A7FF")
    , orientation = "h"
    , width = 900
    , height = 1000
    , hoverinfo = "text"
    , text = ~paste(
      "State: ", state_full
      , "<br>State Population: ", comma(state_pop)
      , "<br>Reported Incidents: ", state_totals
      , "<br>Incidents per Million Residents: ", comma(totals_per_million)
    )
  ) %>% 
  layout(
    xaxis = list(title = "Reported Incidents per Million State Residents")
    , yaxis = list(title = "")
  )

b = df_plot %>% 
  plot_ly(
    y = ~state
    , x = ~state_totals
    , type = "bar"
    , marker = list(color = "#35A7FF")
    , orientation = "h"
    , width = 900
    , height = 1000
    , hoverinfo = "text"
    , showlegend = FALSE
    , text = ~paste(
      "State: ", state_full
      , "<br>State Population: ", comma(state_pop)
      , "<br>Reported Incidents: ", state_totals
      , "<br>Incidents per Million Residents: ", comma(totals_per_million)
    )
  ) %>% 
  layout(
    xaxis = list(title = "Total Reported Incidents")
    , yaxis = list(title = "")
  )

subplot(a, b, titleX = TRUE)

```

### _Excluding counties with null populations_
There are some counties with null values when it comes to population. We can obviously find these via a better dataset, but for now I'm going to exclude them in order to provide examples with the data that we have. _Obviously, this is not accurate reporting._

## County Incidents per Capita 

```{r}

df_filtered = df_merged

df_filtered = df_filtered %>% 
  left_join(us_stats_clean) %>% 
  filter(!is.na(population)) 

df_filtered = df_filtered %>% 
  group_by(state, county) %>% 
  add_tally() %>% 
  rename("county_totals" = n) %>% 
  arrange(desc(county_totals)) %>% 
  mutate("incidents_per_capita" = county_totals/(population)) %>% 
  ungroup() 

df_county = df_filtered %>% 
  select(state, state_county, county, state_full, state_totals, median_age, population, county_totals, incidents_per_capita) %>% 
  unique() %>% 
  arrange(desc(incidents_per_capita))

df_county %>% 
  mutate(
    incidents_per_capita = comma(incidents_per_capita, .000001)
    , population = comma(population)
    ) %>% 
  datatable()


```

